Import all libraries used.
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error
import numpy as np
import os
import plotly.graph_objs as go
import plotly.offline as pyo
from prophet import Prophet
Set plot style for matplotlib.
# Apply the FiveThirtyEight theme to every matplotlib figure in this notebook.
plt.style.use('fivethirtyeight')
def decompose_annual(series, period=8760):
    """Additively decompose a time series over an annual period.

    Parameters
    ----------
    series : pandas Series of (assumed hourly) observations.
    period : int, optional
        Seasonal period in samples. Defaults to 8760 (one year of hourly
        data); parameterized so non-hourly data can reuse this function.

    Returns
    -------
    pandas.DataFrame
        Columns 'Observed', 'Trend', 'Seasonal' and 'Residual' from
        statsmodels' seasonal_decompose.
    """
    annual = seasonal_decompose(series, model='additive', period=period)
    return pd.DataFrame({
        'Observed': annual.observed,
        'Trend': annual.trend,
        'Seasonal': annual.seasonal,
        'Residual': annual.resid,
    })
def decompose_daily(series, period=24):
    """Additively decompose a time series over a daily period.

    Parameters
    ----------
    series : pandas Series of (assumed hourly) observations.
    period : int, optional
        Seasonal period in samples. Defaults to 24 (one day of hourly
        data); parameterized so non-hourly data can reuse this function.

    Returns
    -------
    pandas.DataFrame
        Columns 'Observed', 'Trend', 'Seasonal' and 'Residual' from
        statsmodels' seasonal_decompose.
    """
    daily = seasonal_decompose(series, model='additive', period=period)
    return pd.DataFrame({
        'Observed': daily.observed,
        'Trend': daily.trend,
        'Seasonal': daily.seasonal,
        'Residual': daily.resid,
    })
def decompose_useful(df):
    """Augment a station's DataFrame with the decomposition columns that
    proved useful during EDA.

    Adds saturation vapor pressure (via add_saturation_vapor_pressure) plus
    the trend/seasonal/residual components of: temperature at annual (8760 h)
    and daily (24 h) periods, and vapor pressure at the annual period.
    Returns the augmented frame.
    """
    enriched = add_saturation_vapor_pressure(df)
    # One decomposition per (column, period) combination we care about.
    decompositions = {
        'temp_annual': seasonal_decompose(df['TEMP'], model='additive', period=8760),
        'temp_daily': seasonal_decompose(df['TEMP'], model='additive', period=24),
        'vapor_pressure_annual': seasonal_decompose(enriched['VAPOR_PRESSURE'], model='additive', period=8760),
    }
    # Dict insertion order keeps the new columns in the same order as before.
    for prefix, decomp in decompositions.items():
        enriched[f'{prefix}_trend'] = decomp.trend
        enriched[f'{prefix}_seasonal'] = decomp.seasonal
        enriched[f'{prefix}_resid'] = decomp.resid
    return enriched
def decompose_residual_annual(series):
    """Return the residual component of an annual-period decomposition.

    Runs an additive statsmodels seasonal_decompose with period=8760
    (one year of hourly samples) and returns only the residual series.
    """
    decomposition = seasonal_decompose(series, model='additive', period=8760)
    return decomposition.resid
def decompose_residual_daily(series):
    """Return the residual component of a daily-period decomposition.

    Runs an additive statsmodels seasonal_decompose with period=24
    (one day of hourly samples) and returns only the residual series.
    """
    decomposition = seasonal_decompose(series, model='additive', period=24)
    return decomposition.resid
Perform all time series decompositions here.
# Directory containing the processed station CSV files.
folder_path = '../weather/processed'

# Winter-2023 window used for the daily-period (24 h) decompositions.
start_date_winter_2023 = '2023-01-01'
end_date_winter_2023 = '2023-03-31'

# One result dictionary per weather variable, for both the full record
# (annual period, 8760 h) and the winter-2023 subset (daily period, 24 h).
# The individual names are kept because later cells index them directly.
decomposition_total_results_precip_amount = {}
decomposition_winter_2023_results_precip_amount = {}
decomposition_total_results_temp = {}
decomposition_winter_2023_results_temp = {}
decomposition_total_results_relative_humidity = {}
decomposition_winter_2023_results_relative_humidity = {}
decomposition_total_results_station_pressure = {}
decomposition_winter_2023_results_station_pressure = {}
decomposition_total_results_wind_speed = {}
decomposition_winter_2023_results_wind_speed = {}
decomposition_total_results_vapor_pressure = {}
decomposition_winter_2023_results_vapor_pressure = {}

# Map each CSV column to its (total, winter-2023) result dictionaries so the
# per-file loop below needs one stanza total instead of one per column.
_decomposition_targets = {
    'PRECIP_AMOUNT': (decomposition_total_results_precip_amount,
                      decomposition_winter_2023_results_precip_amount),
    'TEMP': (decomposition_total_results_temp,
             decomposition_winter_2023_results_temp),
    'RELATIVE_HUMIDITY': (decomposition_total_results_relative_humidity,
                          decomposition_winter_2023_results_relative_humidity),
    'STATION_PRESSURE': (decomposition_total_results_station_pressure,
                         decomposition_winter_2023_results_station_pressure),
    'WIND_SPEED': (decomposition_total_results_wind_speed,
                   decomposition_winter_2023_results_wind_speed),
    'VAPOR_PRESSURE': (decomposition_total_results_vapor_pressure,
                       decomposition_winter_2023_results_vapor_pressure),
}

# Iterate through all station files in the folder.
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, file_name)
    # Load the CSV with its timestamp column as the index.
    data = pd.read_csv(file_path, parse_dates=['UTC_DATE'], index_col='UTC_DATE')
    subset = data[(data.index >= start_date_winter_2023) & (data.index <= end_date_winter_2023)]
    for column, (total_results, winter_results) in _decomposition_targets.items():
        total_results[file_name] = seasonal_decompose(data[column], model='additive', period=8760)
        winter_results[file_name] = seasonal_decompose(subset[column], model='additive', period=24)

# Station file names, in the order the decompositions were stored.
file_name_keys = list(decomposition_total_results_precip_amount.keys())
The following cell is an example time-series decomposition. This is multi-year temperature data for BARRIE-ORO, with a seasonality period of 8760 hours or 365 days.
# Plot the four components of the first station's multi-year temperature
# decomposition (annual period), one panel per component.
result = decomposition_total_results_temp[file_name_keys[0]]
fig, axes = plt.subplots(4, 1, figsize=(8, 10), dpi=300)
panels = [(result.observed, 'Observed'), (result.trend, 'Trend'),
          (result.seasonal, 'Seasonal'), (result.resid, 'Residual')]
for axis, (component, label) in zip(axes, panels):
    component.plot(ax=axis, lw=1)
    axis.set_ylabel(label)
plt.tight_layout()
# NOTE(review): the file name says "day" but this is the annual decomposition.
plt.savefig('../../../images/decomp_day_temp.png')
The following cell is another example time-series decomposition. This is temperature data for OSHAWA over the first quarter of 2023, with a seasonality period of 24 hours.
# Plot the four components of the sixth station's winter-2023 temperature
# decomposition (daily period), one panel per component.
result = decomposition_winter_2023_results_temp[file_name_keys[5]]
fig, axes = plt.subplots(4, 1, figsize=(10, 10))
panels = [(result.observed, 'Observed'), (result.trend, 'Trend'),
          (result.seasonal, 'Seasonal'), (result.resid, 'Residual')]
for axis, (component, label) in zip(axes, panels):
    component.plot(ax=axis, lw=1.5)
    axis.set_ylabel(label)
plt.tight_layout()
The following lists, for each station, the lowest correlation coefficient between its observed temperature data and that of any other station.
# Pairwise Pearson correlation of the OBSERVED temperature series across all
# stations; report each station's minimum correlation against the others.
trend_components = [decomposition_total_results_temp[key].observed.dropna()
                    for key in decomposition_total_results_temp]
num_trends = len(trend_components)
correlation_matrix = np.zeros((num_trends, num_trends))
for i, first in enumerate(trend_components):
    for j, second in enumerate(trend_components):
        # Trim both series to a common length before correlating.
        # NOTE(review): this aligns by position, not by timestamp.
        shared = min(len(first), len(second))
        correlation_matrix[i, j] = np.corrcoef(first[:shared], second[:shared])[0, 1]
min_correlations = correlation_matrix.min(axis=1)
print("Minimum correlation coefficient for each observed component:")
print(min_correlations)
Minimum correlation coefficient for each observed component: [0.95690319 0.96012219 0.96164095 0.97152613 0.96509161 0.94535057 0.94535057 0.97071375 0.95360712 0.96818369]
In comparison to the above, the minimum correlation coefficients between each station's temperature trend component and those of the other stations, listed below, are noticeably lower.
# Pairwise Pearson correlation of the TREND components across all stations;
# report each station's minimum correlation against the others.
trend_components = [decomposition_total_results_temp[key].trend.dropna()
                    for key in decomposition_total_results_temp]
num_trends = len(trend_components)
correlation_matrix = np.zeros((num_trends, num_trends))
for i, first in enumerate(trend_components):
    for j, second in enumerate(trend_components):
        # Trim both series to a common length before correlating.
        # NOTE(review): this aligns by position, not by timestamp.
        shared = min(len(first), len(second))
        correlation_matrix[i, j] = np.corrcoef(first[:shared], second[:shared])[0, 1]
min_correlations = correlation_matrix.min(axis=1)
print("Minimum correlation coefficient for each trend component:")
print(min_correlations)
Minimum correlation coefficient for each trend component: [0.91835494 0.92283493 0.89846319 0.87345987 0.89298895 0.88175335 0.87345987 0.89161166 0.87900027 0.89577086]
And we do the same for the seasonal component.
# Pairwise Pearson correlation of the SEASONAL components across all
# stations; report each station's minimum correlation against the others.
trend_components = [decomposition_total_results_temp[key].seasonal.dropna()
                    for key in decomposition_total_results_temp]
num_trends = len(trend_components)
correlation_matrix = np.zeros((num_trends, num_trends))
for i, first in enumerate(trend_components):
    for j, second in enumerate(trend_components):
        # Trim both series to a common length before correlating.
        # NOTE(review): this aligns by position, not by timestamp.
        shared = min(len(first), len(second))
        correlation_matrix[i, j] = np.corrcoef(first[:shared], second[:shared])[0, 1]
min_correlations = correlation_matrix.min(axis=1)
print("Minimum correlation coefficient for each seasonal component:")
print(min_correlations)
Minimum correlation coefficient for each seasonal component: [0.98035937 0.98107946 0.97850648 0.98490504 0.98388275 0.96780457 0.96780457 0.988517 0.9736636 0.98495877]
Finally, we do the same for the residual component.
# Pairwise Pearson correlation of the RESIDUAL components across all
# stations; report each station's minimum correlation against the others.
# `trend_components` is reused by the experimental Plotly cell at the bottom
# of the notebook, so the name is kept.
trend_components = [decomposition_total_results_temp[key].resid.dropna()
                    for key in decomposition_total_results_temp]
num_trends = len(trend_components)
correlation_matrix = np.zeros((num_trends, num_trends))
for i, first in enumerate(trend_components):
    for j, second in enumerate(trend_components):
        # Trim both series to a common length before correlating.
        # NOTE(review): this aligns by position, not by timestamp.
        shared = min(len(first), len(second))
        correlation_matrix[i, j] = np.corrcoef(first[:shared], second[:shared])[0, 1]
min_correlations = correlation_matrix.min(axis=1)
print("Minimum correlation coefficient for each resid component:")
print(min_correlations)
Minimum correlation coefficient for each resid component: [0.83730873 0.85316752 0.86475334 0.88011756 0.85516019 0.82803819 0.82803819 0.86523234 0.84853645 0.88655726]
Visualized below are the trend components. Notice the clear correlation between the trend components!
# Overlay every station's annual temperature trend component on one figure.
plt.figure(figsize=(8, 8), dpi=300)
for file_key, decomposition in decomposition_total_results_temp.items():
    station = file_key[:-4]  # strip the trailing ".csv" for the legend
    plt.plot(decomposition.trend.dropna(), label=station, lw=1.5)
plt.style.use('fivethirtyeight')
plt.ylim(1, 12)
plt.title('Comparison of Trend Components for Temperature')
plt.xlabel('Time')
plt.ylabel('Temperature (°C)')
plt.legend(loc='lower right')
plt.savefig('../../../images/temp_trend_comparison.png')
Everything below is experimental.
def _plot_decomposition(decomp_result, line_width, save_path=None):
    """Plot the observed/trend/seasonal/residual panels of one decomposition.

    Parameters
    ----------
    decomp_result : statsmodels DecomposeResult with observed/trend/
        seasonal/resid attributes.
    line_width : float
        Matplotlib line width used in every panel.
    save_path : str or None
        When given, the figure is also written to this path.
    """
    fig, axes = plt.subplots(4, 1, figsize=(10, 8))
    panels = [(decomp_result.observed, 'Observed'),
              (decomp_result.trend, 'Trend'),
              (decomp_result.seasonal, 'Seasonal'),
              (decomp_result.resid, 'Residual')]
    for axis, (component, label) in zip(axes, panels):
        component.plot(ax=axis, lw=line_width)
        axis.set_ylabel(label)
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)

# Annual (thin lines, 0.7) and winter-2023 daily (thick lines, 1.5)
# decompositions for the first station, one figure per decomposition.
# Only the precipitation figures are saved to disk, as before.
first_key = file_name_keys[0]
_plot_decomposition(decomposition_total_results_vapor_pressure[first_key], 0.7)
_plot_decomposition(decomposition_winter_2023_results_vapor_pressure[first_key], 1.5)
_plot_decomposition(decomposition_total_results_relative_humidity[first_key], 0.7)
_plot_decomposition(decomposition_winter_2023_results_relative_humidity[first_key], 1.5)
_plot_decomposition(decomposition_total_results_precip_amount[first_key], 0.7,
                    save_path='../../../images/decomp_year_precip.png')
_plot_decomposition(decomposition_winter_2023_results_precip_amount[first_key], 1.5,
                    save_path='../../../images/decomp_day_precip.png')
_plot_decomposition(decomposition_total_results_station_pressure[first_key], 0.7)
_plot_decomposition(decomposition_winter_2023_results_station_pressure[first_key], 1.5)
_plot_decomposition(decomposition_total_results_wind_speed[first_key], 0.7)
_plot_decomposition(decomposition_winter_2023_results_wind_speed[first_key], 1.5)
Based on the above cells, the only ones that are really worth decomposing are temp and potentially RH and vapor pressure. Let's try to run autocorrelation analysis on temp, RH and vapor pressure.
total_data_autocorr = {}
winter_2023_data_autocorr = {}
# Reload each station's full record and its winter-2023 subset for the
# autocorrelation analysis below.
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, file_name)
    data = pd.read_csv(file_path, parse_dates=['UTC_DATE'], index_col='UTC_DATE')
    subset = data[(data.index >= start_date_winter_2023) & (data.index <= end_date_winter_2023)]
    total_data_autocorr[file_name] = data
    winter_2023_data_autocorr[file_name] = subset

def _autocorrelation_figure(series, lag, title):
    """Print the autocorrelation of `series` at `lag` and draw the full
    pandas autocorrelation plot.

    Returns the single-lag autocorrelation value. (Previously this value
    was computed in every cell and silently discarded.)
    """
    value = series.autocorr(lag=lag)
    print(f'{title} — autocorrelation at lag {lag}: {value}')
    plt.figure(figsize=(10, 6))
    pd.plotting.autocorrelation_plot(series)
    plt.title(title)
    plt.xlabel('Lag (Hours/Days/Months)')
    plt.ylabel('Autocorrelation')
    plt.grid(True)
    return value

autocorr_values = _autocorrelation_figure(
    total_data_autocorr['BARRIE-ORO.csv']['TEMP'], 8760,
    'Autocorrelation of Temperature Data, Annual Period')
autocorr_values = _autocorrelation_figure(
    winter_2023_data_autocorr['BARRIE-ORO.csv']['TEMP'], 24,
    'Autocorrelation of Temperature Data, Daily Period')
autocorr_values = _autocorrelation_figure(
    total_data_autocorr['BARRIE-ORO.csv']['RELATIVE_HUMIDITY'], 8760,
    'Autocorrelation of RH Data, Annual Period')
autocorr_values = _autocorrelation_figure(
    total_data_autocorr['BARRIE-ORO.csv']['VAPOR_PRESSURE'], 8760,
    'Autocorrelation of Vapor Pressure Data, Annual Period')
autocorr_values = _autocorrelation_figure(
    winter_2023_data_autocorr['BARRIE-ORO.csv']['VAPOR_PRESSURE'], 24,
    'Autocorrelation of Vapor Pressure Data, Daily Period For Fun')
total_data_lag = {}
winter_2023_data_lag = {}
# Reload every station so the lag experiments below can mutate these frames
# (via inplace dropna) without touching the autocorrelation copies.
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.csv'):
        continue
    # Load the CSV, indexed by timestamp; keep `data`/`subset` names since
    # they are module-level in this notebook.
    data = pd.read_csv(os.path.join(folder_path, file_name),
                       parse_dates=['UTC_DATE'], index_col='UTC_DATE')
    subset = data[(data.index >= start_date_winter_2023) & (data.index <= end_date_winter_2023)]
    total_data_lag[file_name] = data
    winter_2023_data_lag[file_name] = subset
# Shift the BARRIE-ORO vapor-pressure series by one time step and compare
# its correlation with precipitation against the unshifted series.
lag = 1
barrie = total_data_lag['BARRIE-ORO.csv']
barrie['VAPOR_PRESSURE_LAGGED'] = barrie['VAPOR_PRESSURE'].shift(lag)
#print(total_data_lag['BARRIE-ORO.csv'])
# Drop the NaN rows the shift introduced (mutates the stored frame).
barrie.dropna(inplace=True)
# Pearson correlation: lagged vapor pressure vs precipitation.
correlation = np.corrcoef(barrie['PRECIP_AMOUNT'], barrie['VAPOR_PRESSURE_LAGGED'])[0, 1]
print(f"Pearson's Correlation Coefficient Lagged: {correlation}")
# Pearson correlation: unlagged vapor pressure vs precipitation.
correlation = np.corrcoef(barrie['PRECIP_AMOUNT'], barrie['VAPOR_PRESSURE'])[0, 1]
print(f"Pearson's Correlation Coefficient Not Lagged: {correlation}")
Pearson's Correlation Coefficient Lagged: 0.08094125568417582 Pearson's Correlation Coefficient Not Lagged: 0.0833091374756897
# Does precipitation at OSHAWA lead precipitation at TORONTO CITY?
lag = -1  # negative shift pulls each value one time step earlier
working = total_data_lag['OSHAWA.csv']
working['PRECIP_AMOUNT_LAGGED'] = working['PRECIP_AMOUNT'].shift(lag)
# Drop the NaN rows the shift introduced (mutates the stored frame).
working.dropna(inplace=True)
# Align the two stations on their shared timestamps via an index join.
merged_data = pd.merge(working[['PRECIP_AMOUNT_LAGGED']],
                       total_data_lag['TORONTO CITY.csv'][['PRECIP_AMOUNT']],
                       left_index=True, right_index=True)
# Pearson correlation between the lagged OSHAWA and TORONTO CITY series.
correlation = np.corrcoef(merged_data['PRECIP_AMOUNT_LAGGED'],
                          merged_data['PRECIP_AMOUNT'])[0, 1]
print(f"Pearson's Correlation Coefficient: {correlation}")
Pearson's Correlation Coefficient: 0.374451810478636
# Lag the wind direction data by a specified number of time steps (e.g., 1 time step)
lag = 1
barrie = total_data_lag['BARRIE-ORO.csv']
# BUG FIX: the original lagged the global `data['WIND_DIRECTION']` — i.e.
# whichever station the preceding loader loop happened to load LAST — rather
# than BARRIE-ORO's own wind direction. Use the BARRIE-ORO frame itself.
# NOTE(review): the `* 10` suggests raw values are stored in tens of degrees,
# but the unlagged series below is NOT scaled, so the two series end up in
# different units — confirm the intended convention before trusting this.
barrie['WIND_DIRECTION_LAGGED'] = barrie['WIND_DIRECTION'].shift(lag) * 10
# Remove NaN values resulting from the lag operation (mutates the stored frame).
barrie.dropna(inplace=True)
# Circular correlation via the mean cosine of the angular differences.
direction_rad = np.deg2rad(barrie['WIND_DIRECTION'])
lagged_direction_rad = np.deg2rad(barrie['WIND_DIRECTION_LAGGED'])
circular_corr = np.cos(direction_rad - lagged_direction_rad).mean()
print(f"Manual Circular Correlation: {circular_corr}")
Manual Circular Correlation: -0.10890790447407833
Everything below this is highly experimental EDA.
# Build one interactive Plotly line per series in `trend_components`.
# NOTE(review): `trend_components` holds whatever the last correlation cell
# assigned (the residual series), despite the "Trend" labels — confirm intent.
traces = [
    go.Scatter(x=component.index,
               y=component.values,
               mode='lines',
               name=f'Trend {index + 1}')
    for index, component in enumerate(trend_components)
]
# Shared layout for the comparison figure.
layout = go.Layout(
    title='Trend Lines Comparison',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Temperature'),
)
fig = go.Figure(data=traces, layout=layout)
# Render the interactive figure inline in the notebook.
pyo.iplot(fig)